x86: Xsave support for PV guests.
author    Keir Fraser <keir@xen.org>        Wed, 3 Nov 2010 08:15:20 +0000
committer Keir Fraser <keir@xen.org>        Wed, 3 Nov 2010 08:15:20 +0000
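
This extends XSAVE/XRSTOR support, previously available only to HVM
guests, to PV guests as well: the xsave area and xcr0 move from struct
hvm_vcpu into struct arch_vcpu, the XSAVE feature bit is no longer
masked out of the PV CPUID policy, CR4.OSXSAVE is reflected into PV
guests, and XSETBV (0f 01 d1) is emulated in emulate_privileged_op().
A new xcr0_accum mask accumulates every feature the guest has ever
enabled, and Xen switches XCR0 to that mask around XSAVE/XRSTOR so that
state belonging to features the guest has since disabled is still
preserved across context switches.
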
Signed-off-by: Shan Haitao <haitao.shan@intel.com>
Signed-off-by: Han Weidong <weidong.han@intel.com>
tools/libxc/xc_cpuid_x86.c
xen/arch/x86/domain.c
xen/arch/x86/hvm/hvm.c
xen/arch/x86/hvm/vmx/vmx.c
xen/arch/x86/i387.c
xen/arch/x86/traps.c
xen/include/asm-x86/domain.h
xen/include/asm-x86/hvm/vcpu.h
xen/include/asm-x86/i387.h

diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c
index e517eb7733112227abd2b845f101307dfe613f7d..68e99a17389751f41d18c64151639aedc744d874 100644
--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -323,7 +323,6 @@ static void xc_cpuid_pv_policy(
         clear_bit(X86_FEATURE_XTPR, regs[2]);
         clear_bit(X86_FEATURE_PDCM, regs[2]);
         clear_bit(X86_FEATURE_DCA, regs[2]);
-        clear_bit(X86_FEATURE_XSAVE, regs[2]);
         set_bit(X86_FEATURE_HYPERVISOR, regs[2]);
         break;
     case 0x80000001:
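
With XSAVE no longer cleared from the PV CPUID policy, a PV guest sees the
hardware XSAVE capability in CPUID.1:ECX bit 26; bit 27 (OSXSAVE) mirrors
CR4.OSXSAVE, which the pv_guest_cr4_to_real_cr4() change below now forces
on. A minimal guest-side probe might look like this sketch (the bit
positions are architectural; the program itself is illustrative):

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if ( !__get_cpuid(1, &eax, &ebx, &ecx, &edx) )
            return 1;

        printf("XSAVE supported: %u, enabled by OS: %u\n",
               (ecx >> 26) & 1,   /* CPUID.1:ECX.XSAVE */
               (ecx >> 27) & 1);  /* CPUID.1:ECX.OSXSAVE = CR4.OSXSAVE */
        return 0;
    }
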
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index b67ddd6a00874d77ceb9899add75308ab4b629eb..7356efff3f90461ef89857f2aa8b224a74e643a9 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -343,10 +343,26 @@ int vcpu_initialise(struct vcpu *v)
 
     paging_vcpu_init(v);
 
+    if ( cpu_has_xsave )
+    {
+        /* XSAVE/XRSTOR requires the save area to be 64-byte aligned. */
+        void *xsave_area = _xmalloc(xsave_cntxt_size, 64);
+        if ( xsave_area == NULL )
+            return -ENOMEM;
+
+        xsave_init_save_area(xsave_area);
+        v->arch.xsave_area = xsave_area;
+        v->arch.xcr0 = XSTATE_FP_SSE;
+        v->arch.xcr0_accum = XSTATE_FP_SSE;
+    }
+
     if ( is_hvm_domain(d) )
     {
         if ( (rc = hvm_vcpu_initialise(v)) != 0 )
+        {
+            xfree(v->arch.xsave_area);
             return rc;
+        }
     }
     else
     {
@@ -376,7 +392,13 @@ int vcpu_initialise(struct vcpu *v)
 
     spin_lock_init(&v->arch.shadow_ldt_lock);
 
-    return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
+    rc = 0;
+    if ( is_pv_32on64_vcpu(v) )
+        rc = setup_compat_l4(v);
+    if ( rc ) /* failure: free the xsave area allocated above */
+        xfree(v->arch.xsave_area);
+
+    return rc;
 }
 
 void vcpu_destroy(struct vcpu *v)
@@ -384,6 +406,8 @@ void vcpu_destroy(struct vcpu *v)
     if ( is_pv_32on64_vcpu(v) )
         release_compat_l4(v);
 
+    xfree(v->arch.xsave_area);
+
     if ( is_hvm_vcpu(v) )
         hvm_vcpu_destroy(v);
 }
@@ -592,6 +616,8 @@ unsigned long pv_guest_cr4_fixup(const struct vcpu *v, unsigned long guest_cr4)
         hv_cr4_mask &= ~X86_CR4_DE;
     if ( cpu_has_fsgsbase && !is_pv_32bit_domain(v->domain) )
         hv_cr4_mask &= ~X86_CR4_FSGSBASE;
+    if ( cpu_has_xsave )
+        hv_cr4_mask &= ~X86_CR4_OSXSAVE;
 
     if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
         gdprintk(XENLOG_WARNING,
@@ -1367,6 +1393,8 @@ static void __context_switch(void)
         memcpy(stack_regs,
                &n->arch.guest_context.user_regs,
                CTXT_SWITCH_STACK_BYTES);
+        if ( cpu_has_xsave && n->arch.xcr0 != get_xcr0() )
+            set_xcr0(n->arch.xcr0);
         n->arch.ctxt_switch_to(n);
     }
 
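
vcpu_initialise() above now owns the save-area allocation for both PV and
HVM vcpus, 64-byte aligned because XSAVE/XRSTOR raise #GP on a misaligned
operand. A userspace sketch of the same rules (the 4096-byte size and the
all-ones instruction mask are assumptions; Xen sizes the area from the
CPUID-derived xsave_cntxt_size):

    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        /* XSAVE/XRSTOR fault unless the save area is 64-byte aligned. */
        void *area = aligned_alloc(64, 4096);    /* size is an assumption */
        if ( area == NULL )
            return 1;
        memset(area, 0, 4096);                   /* start with a clean header */

        /* EDX:EAX is the instruction mask; the CPU saves mask & XCR0. */
        asm volatile ( "xsave (%0)"
                       : : "r" (area), "a" (~0u), "d" (~0u)
                       : "memory" );

        free(area);
        return 0;
    }
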
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 45afb530d60852e9b2c9bda579b79b8d6e448929..43d907a2ee608052ad1664224d3d77122ed02162 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -805,18 +805,6 @@ int hvm_vcpu_initialise(struct vcpu *v)
 
     hvm_asid_flush_vcpu(v);
 
-    if ( cpu_has_xsave )
-    {
-        /* XSAVE/XRSTOR requires the save area be 64-byte-boundary aligned. */
-        void *xsave_area = _xmalloc(xsave_cntxt_size, 64);
-        if ( xsave_area == NULL )
-            return -ENOMEM;
-
-        xsave_init_save_area(xsave_area);
-        v->arch.hvm_vcpu.xsave_area = xsave_area;
-        v->arch.hvm_vcpu.xcr0 = XSTATE_FP_SSE;
-    }
-
     if ( (rc = vlapic_init(v)) != 0 )
         goto fail1;
 
@@ -879,7 +867,6 @@ void hvm_vcpu_destroy(struct vcpu *v)
     hvm_vcpu_cacheattr_destroy(v);
     vlapic_destroy(v);
     hvm_funcs.vcpu_destroy(v);
-    xfree(v->arch.hvm_vcpu.xsave_area);
 
     /* Event channel is already freed by evtchn_destroy(). */
     /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 24c2331f472df059a4257bfa4ecf0b17db934cc2..e24f0093e4b9bc432f6a5aae55921fe2de341497 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -652,10 +652,7 @@ static void vmx_ctxt_switch_to(struct vcpu *v)
     struct domain *d = v->domain;
     unsigned long old_cr4 = read_cr4(), new_cr4 = mmu_cr4_features;
 
-    /* HOST_CR4 in VMCS is always mmu_cr4_features and
-     * CR4_OSXSAVE(if supported). Sync CR4 now. */
-    if ( cpu_has_xsave )
-        new_cr4 |= X86_CR4_OSXSAVE;
+    /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */
     if ( old_cr4 != new_cr4 )
         write_cr4(new_cr4);
 
@@ -2215,7 +2212,8 @@ static int vmx_handle_xsetbv(u64 new_bv)
     if ( (xfeature_mask & XSTATE_YMM & new_bv) && !(new_bv & XSTATE_SSE) )
         goto err;
 
-    v->arch.hvm_vcpu.xcr0 = new_bv;
+    v->arch.xcr0 = new_bv;
+    v->arch.xcr0_accum |= new_bv;
     set_xcr0(new_bv);
     return 0;
 err:
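
vmx_handle_xsetbv() above and the PV XSETBV emulation added to traps.c
below enforce the same architectural constraints on a new XCR0 value.
Collected into a single predicate they amount to the following sketch; no
such helper exists in the patch, and the XSTATE_* values mirror Xen's
definitions:

    #include <stdbool.h>
    #include <stdint.h>

    #define XSTATE_FP  (1ULL << 0)   /* x87 */
    #define XSTATE_SSE (1ULL << 1)   /* XMM */
    #define XSTATE_YMM (1ULL << 2)   /* AVX */

    /* xfeature_mask is the feature set the processor advertises. */
    static bool xcr0_is_valid(uint64_t new_bv, uint64_t xfeature_mask)
    {
        if ( !(new_bv & XSTATE_FP) )        /* bit 0 must always be set */
            return false;
        if ( new_bv & ~xfeature_mask )      /* no unsupported/reserved bits */
            return false;
        if ( (new_bv & XSTATE_YMM) && !(new_bv & XSTATE_SSE) )
            return false;                   /* YMM state depends on SSE */
        return true;
    }
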
diff --git a/xen/arch/x86/i387.c b/xen/arch/x86/i387.c
index fa16fa9c5c51578d9a7dbf2cd60bb4de4fcb470d..27b49234ede8abdd4b02f9197e4bb9f21752e655 100644
--- a/xen/arch/x86/i387.c
+++ b/xen/arch/x86/i387.c
@@ -33,9 +33,14 @@ void save_init_fpu(struct vcpu *v)
     if ( cr0 & X86_CR0_TS )
         clts();
 
-    if ( cpu_has_xsave && is_hvm_vcpu(v) )
+    if ( cpu_has_xsave )
     {
+        /* XCR0 normally holds whatever the guest OS has enabled. Xen itself
+         * switches to the full accumulated feature mask around save/restore
+         * so that state the guest has since disabled is not lost. */
+        set_xcr0(v->arch.xcr0_accum);
         xsave(v);
+        set_xcr0(v->arch.xcr0);
     }
     else if ( cpu_has_fxsr )
     {
@@ -144,6 +149,9 @@ u32 xsave_cntxt_size;
 /* A 64-bit bitmask of the XSAVE/XRSTOR features supported by processor. */
 u64 xfeature_mask;
 
+/* Cached copy of XCR0, so reads need not execute XGETBV. */
+DEFINE_PER_CPU(uint64_t, xcr0);
+
 void xsave_init(void)
 {
     u32 eax, ebx, ecx, edx;
@@ -171,13 +179,11 @@ void xsave_init(void)
     BUG_ON(ecx < min_size);
 
     /*
-     * We will only enable the features we know for hvm guest. Here we use
-     * set/clear CR4_OSXSAVE and re-run cpuid to get xsave_cntxt_size.
+     * Set CR4.OSXSAVE, program XCR0, and re-run CPUID for xsave_cntxt_size.
      */
     set_in_cr4(X86_CR4_OSXSAVE);
     set_xcr0(eax & XCNTXT_MASK);
     cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
-    clear_in_cr4(X86_CR4_OSXSAVE);
 
     if ( cpu == 0 )
     {
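
xsave_init() must set CR4.OSXSAVE and program XCR0 before the second CPUID
because leaf 0xD, sub-leaf 0 reports in EBX the save-area size for the
features currently enabled in XCR0, and in ECX the size needed if every
supported feature were enabled. A userspace sketch of the same query
(assuming the OS has already enabled OSXSAVE):

    #include <stdio.h>
    #include <cpuid.h>

    #define XSTATE_CPUID 0xd

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* Sub-leaf 0: EBX = bytes for the XCR0-enabled feature set,
         * ECX = bytes if all supported features were enabled. */
        __cpuid_count(XSTATE_CPUID, 0, eax, ebx, ecx, edx);

        printf("current save-area size: %u bytes (maximum: %u)\n", ebx, ecx);
        return 0;
    }
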
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index fa3f48ebb8ce42200cff334b7b8d800a27f9bc23..f91e3c8ae3b67dcbf41e001c3e17ade0d9b83aed 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -795,7 +795,6 @@ static void pv_cpuid(struct cpu_user_regs *regs)
         __clear_bit(X86_FEATURE_XTPR % 32, &c);
         __clear_bit(X86_FEATURE_PDCM % 32, &c);
         __clear_bit(X86_FEATURE_DCA % 32, &c);
-        __clear_bit(X86_FEATURE_XSAVE % 32, &c);
         if ( !cpu_has_apic )
            __clear_bit(X86_FEATURE_X2APIC % 32, &c);
         __set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
@@ -1715,7 +1714,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
     enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
     int rc;
     unsigned int port, i, data_sel, ar, data, bpmatch = 0;
-    unsigned int op_bytes, op_default, ad_bytes, ad_default;
+    unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix = 0;
 #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
                     ? regs->reg \
                     : ad_bytes == 4 \
@@ -1751,6 +1750,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
         switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
         {
         case 0x66: /* operand-size override */
+            opsize_prefix = 1;
             op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
             continue;
         case 0x67: /* address-size override */
@@ -2051,13 +2051,48 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
         goto fail;
     switch ( opcode )
     {
-    case 0x1: /* RDTSCP */
-        if ( (v->arch.guest_context.ctrlreg[4] & X86_CR4_TSD) &&
-             !guest_kernel_mode(v, regs) )
-            goto fail;
-        if ( insn_fetch(u8, code_base, eip, code_limit) != 0xf9 )
+    case 0x1: /* RDTSCP and XSETBV */
+        switch ( insn_fetch(u8, code_base, eip, code_limit) )
+        {
+        case 0xf9: /* RDTSCP */
+            if ( (v->arch.guest_context.ctrlreg[4] & X86_CR4_TSD) &&
+                 !guest_kernel_mode(v, regs) )
+                goto fail;
+            pv_soft_rdtsc(v, regs, 1);
+            break;
+        case 0xd1: /* XSETBV */
+        {
+            u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
+
+            if ( lock || rep_prefix || opsize_prefix
+                 || !(v->arch.guest_context.ctrlreg[4] & X86_CR4_OSXSAVE) )
+            {
+                do_guest_trap(TRAP_invalid_op, regs, 0);
+                goto skip;
+            }
+
+            if ( !guest_kernel_mode(v, regs) )
+                goto fail;
+
+            switch ( (u32)regs->ecx )
+            {
+                case XCR_XFEATURE_ENABLED_MASK:
+                    /* Bit 0 (x87) of XCR0 must be set; reserved bits clear. */
+                    if ( !(new_xfeature & XSTATE_FP) || (new_xfeature & ~xfeature_mask) )
+                        goto fail;
+
+                    v->arch.xcr0 = new_xfeature;
+                    v->arch.xcr0_accum |= new_xfeature;
+                    set_xcr0(new_xfeature);
+                    break;
+                default:
+                    goto fail;
+            }
+            break;
+        }
+        default:
             goto fail;
-        pv_soft_rdtsc(v, regs, 1);
+        }
         break;
 
     case 0x06: /* CLTS */
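
The 0xd1 byte matched above is the ModRM of XSETBV (full encoding
0f 01 d1), with ECX selecting the extended control register and EDX:EAX
supplying the new value. A PV kernel runs outside ring 0, so executing the
instruction faults into Xen, which completes it through the emulation
above. A sketch of the guest side:

    #include <stdint.h>

    #define XCR_XFEATURE_ENABLED_MASK 0   /* XCR0 */

    /* In a PV kernel this #GPs and emulate_privileged_op() finishes it. */
    static inline void xsetbv(uint32_t index, uint64_t value)
    {
        asm volatile ( ".byte 0x0f,0x01,0xd1"        /* xsetbv */
                       : : "c" (index),
                           "a" ((uint32_t)value),
                           "d" ((uint32_t)(value >> 32)) );
    }
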
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 7549126504575b348a3931dd0b6f4058d9bd8572..15b050ac0e472e521e1bc6dbea7d766c4fd0e5b0 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -400,6 +400,23 @@ struct arch_vcpu
     pagetable_t monitor_table;          /* (MFN) hypervisor PT (for HVM) */
     unsigned long cr3;                  /* (MA) value to install in HW CR3 */
 
+    /*
+     * The save area for Processor Extended States and the bitmask of the
+     * XSAVE/XRSTOR features. They are used as follows: 1) when a vcpu that
+     * has dirtied FPU/SSE state is scheduled out, we XSAVE that state here;
+     * 2) in the #NM handler, we XRSTOR the state previously XSAVE-ed.
+     */
+    void *xsave_area;
+    uint64_t xcr0;
+    /* Accumulated extended-feature mask, used when Xen itself performs
+     * XSAVE/XRSTOR: we can never know whether the guest OS relies on the
+     * contents of a component being preserved while its feature flag is
+     * (perhaps only temporarily) cleared, so Xen saves and restores every
+     * feature the guest has ever enabled. The guest still cannot touch an
+     * extended state until it re-enables the feature via XCR0.
+     */
+    uint64_t xcr0_accum;
+
     /* Current LDT details. */
     unsigned long shadow_ldt_mapcnt;
     spinlock_t shadow_ldt_lock;
@@ -435,7 +452,8 @@ unsigned long pv_guest_cr4_fixup(const struct vcpu *, unsigned long guest_cr4);
 #define pv_guest_cr4_to_real_cr4(v)                         \
     (((v)->arch.guest_context.ctrlreg[4]                    \
       | (mmu_cr4_features & (X86_CR4_PGE | X86_CR4_PSE))    \
-      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0))         \
+      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)          \
+      | (cpu_has_xsave ? X86_CR4_OSXSAVE : 0))              \
       & ~X86_CR4_DE)
 #define real_cr4_to_pv_guest_cr4(c) \
     ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD | X86_CR4_OSXSAVE))
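
The split between xcr0 and xcr0_accum matters once a guest toggles a
feature off: Xen keeps saving and restoring the disabled component, since
the guest may rely on its contents surviving. A small standalone
simulation of the bookkeeping (XSTATE_* values mirror Xen's; the sequence
is hypothetical):

    #include <assert.h>
    #include <stdint.h>

    #define XSTATE_FP  (1ULL << 0)
    #define XSTATE_SSE (1ULL << 1)
    #define XSTATE_YMM (1ULL << 2)

    int main(void)
    {
        /* vcpu_initialise(): both masks start as FP|SSE. */
        uint64_t xcr0 = XSTATE_FP | XSTATE_SSE, xcr0_accum = xcr0;

        /* Guest XSETBV enables AVX, then disables it again. */
        xcr0 = XSTATE_FP | XSTATE_SSE | XSTATE_YMM;  xcr0_accum |= xcr0;
        xcr0 = XSTATE_FP | XSTATE_SSE;               xcr0_accum |= xcr0;

        /* Xen saves/restores with xcr0_accum, so the YMM image survives
         * even though YMM is currently disabled in xcr0. */
        assert( (xcr0_accum & XSTATE_YMM) && !(xcr0 & XSTATE_YMM) );
        return 0;
    }
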
diff --git a/xen/include/asm-x86/hvm/vcpu.h b/xen/include/asm-x86/hvm/vcpu.h
index 53ef98320f98dcdc15ca232b3172dcd88ef14da3..1d72ecfc609bb9c668acb47331f44d0abda567d1 100644
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -49,15 +49,6 @@ struct hvm_vcpu {
      */
     unsigned long       hw_cr[5];
 
-    /*
-     * The save area for Processor Extended States and the bitmask of the
-     * XSAVE/XRSTOR features. They are used by: 1) when a vcpu (which has
-     * dirtied FPU/SSE) is scheduled out we XSAVE the states here; 2) in
-     * #NM handler, we XRSTOR the states we XSAVE-ed;
-     */
-    void *xsave_area;
-    uint64_t xcr0;
-
     struct vlapic       vlapic;
     s64                 cache_tsc_offset;
     u64                 guest_time;
diff --git a/xen/include/asm-x86/i387.h b/xen/include/asm-x86/i387.h
index ba365c0b3caa04fff0ab655866f26277802857db..ba773e99edc9baa150bf5415f2156fd37072bbf5 100644
--- a/xen/include/asm-x86/i387.h
+++ b/xen/include/asm-x86/i387.h
@@ -49,6 +49,8 @@ struct xsave_struct
 #define REX_PREFIX
 #endif
 
+DECLARE_PER_CPU(uint64_t, xcr0);
+
 static inline void xsetbv(u32 index, u64 xfeatures)
 {
     u32 hi = xfeatures >> 32;
@@ -60,14 +62,20 @@ static inline void xsetbv(u32 index, u64 xfeatures)
 
 static inline void set_xcr0(u64 xfeatures)
 {
+    this_cpu(xcr0) = xfeatures;
     xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures);
 }
 
+static inline uint64_t get_xcr0(void)
+{
+    return this_cpu(xcr0);
+}
+
 static inline void xsave(struct vcpu *v)
 {
     struct xsave_struct *ptr;
 
-    ptr =(struct xsave_struct *)v->arch.hvm_vcpu.xsave_area;
+    ptr = (struct xsave_struct *)v->arch.xsave_area;
 
     asm volatile (".byte " REX_PREFIX "0x0f,0xae,0x27"
         :
@@ -79,7 +87,7 @@ static inline void xrstor(struct vcpu *v)
 {
     struct xsave_struct *ptr;
 
-    ptr =(struct xsave_struct *)v->arch.hvm_vcpu.xsave_area;
+    ptr = (struct xsave_struct *)v->arch.xsave_area;
 
     asm volatile (".byte " REX_PREFIX "0x0f,0xae,0x2f"
         :
@@ -108,14 +116,18 @@ static inline void setup_fpu(struct vcpu *v)
     if ( !v->fpu_dirtied )
     {
         v->fpu_dirtied = 1;
-        if ( cpu_has_xsave && is_hvm_vcpu(v) )
+        if ( cpu_has_xsave )
         {
             if ( !v->fpu_initialised )
                 v->fpu_initialised = 1;
 
-            set_xcr0(v->arch.hvm_vcpu.xcr0 | XSTATE_FP_SSE);
+            /* XCR0 normally holds whatever the guest OS has enabled. Xen
+             * itself switches to the full accumulated feature mask around
+             * the restore so every previously saved component is reloaded.
+             */
+            set_xcr0(v->arch.xcr0_accum);
             xrstor(v);
-            set_xcr0(v->arch.hvm_vcpu.xcr0);
+            set_xcr0(v->arch.xcr0);
         }
         else
         {